*==============================================================================*
*===*                  Overconfidence and Gun Preferences                  *===*
*===*                         * Cleaning dataset *                         *===*
*==============================================================================*

* Authors: Fernando G. Cafferata, Patricio Dominguez, and Carlos Scartascini
* Code written by Fernando G. Cafferata and Matias Guizzo Altube
* Edited by: Andres Barinas-Forero

****===========================================================================*
**#*===* 0. Preamble

* Start from a clean session (data, Mata objects, matrices) and turn off output paging
clear all
clear mata
clear matrix
set more off

* Project paths. Change the path global to your own copy of the replication
* package; the data and output globals below are derived from it.
global today = strtrim(c(current_date))
global path "C:/Users/afelipeb/OneDrive - Inter-American Development Bank Group/IADB Andres Barinas/Overconfidence_Guns/Replication Package"
global data "${path}/Data"
global output "${path}/Output"

	
****===========================================================================*
**#*===* 1. Appending individual country datasets

// use "$data\AR_iadb_remedial_v20200110", clear /* 262 variables y 1119 observaciones */
// use "$data\AR_iadb_v20200110", clear /* 322 variables y 1246 observaciones */
// use "$data\BR_iadb_v20200110", clear /* 341 variables y 1300 observaciones */
// use "$data\CL_iadb_v20200110", clear /* 324 variables y 1280 observaciones */
// use "$data\CO_iadb_v20200110", clear /* 329 variables y 1300 observaciones */
// use "$data\MX_iadb_v20200110", clear /* 343 variables y 1299 observaciones */
// use "$data\US_iadb_v20191105", clear /* 196 variables y 1000 observaciones */

* Stack the country files into one dataset. PAIS identifies the source file:
*   0 = AR remedial, 1 = AR (other file), 2 = BR, 3 = CL, 4 = CO, 5 = MX, 6 = US.
* Paths use forward slashes, like the path globals defined in the preamble:
* unlike backslashes they are valid on Windows, macOS and Linux.
use "$data/AR_iadb_remedial_v20200110", clear
* generate(PAIS) codes the rows already in memory as 0 and the appended rows as 1
append using "$data/AR_iadb_v20200110", generate(PAIS) force

* Remaining files: rows arrive with PAIS missing and receive the next code (2 to 6)
local code = 1
foreach f in BR_iadb_v20200110 CL_iadb_v20200110 CO_iadb_v20200110 MX_iadb_v20200110 US_iadb_v20191105 {
	local ++code
	append using "$data/`f'", force
	replace PAIS = `code' if PAIS == .
}

	label var PAIS "Country"
	lab define l_pais 0 "Argentina_1" 1 "Argentina_2" 2 "Brasil" 3 "Chile" 4 "Colombia" 5 "México" 6 "EE.UU.", modify
	lab val PAIS l_pais

* Fill empty country strings with US
replace country="US" if country==""

* Argentina_1 (PAIS 0, the remedial file) is the right database for Argentina,
* so the observations that belong to Argentina_2 (PAIS 1) are dropped and the
* remaining Argentine rows take code 1.
drop if PAIS==1

replace PAIS=1 if PAIS==0
	lab define l_pais 1 "Argentina" 2 "Brasil" 3 "Chile" 4 "Colombia" 5 "México" 6 "EE.UU.", modify
	lab val PAIS l_pais

// save "$data\Overconfidence_Merge.dta", replace

*************************************
**# Generating Treatment variables (Carlos)
*************************************

* Treatment indicators (Carlos): 1 when the treatment variable is non-missing.
* Original note: this leaves the US dataset aside because it has no news_t.
generate T_News = (news_t != .)
label variable T_News "Treatment news = 1"

* Same construction from info2_t.
* NOTE(review): the original comment here also cited news_t; presumably info2_t
* is likewise absent from the US file - confirm.
generate T_Data = (info2_t != .)
label variable T_Data "Treatment actual data = 1"

* Treatment indicators (Fernando), built from info_condition.
* Original note: condition 1 is Questionnaire A, condition 2 is Questionnaire B.
generate t_news = (info_condition == 1)
label variable t_news "News treatment = 1"

generate t_data = inlist(info_condition, 1, 2)
label variable t_data "Data Treatment = 1"

global treats "T_News T_Data t_news t_data"


**********************************************
**# Generate Over and Under Estimation Variables
**********************************************

* Country-level dispersion of the crime estimate (hrest), with and without the top tail
bysort PAIS: egen StDev = sd(hrest)
label variable StDev "SD of estimate by country"

bysort PAIS: egen P99 = pctile(hrest), p(99)
label variable P99 "Perc 99 of estimate by country"

* Keep the estimate only when it is below the country's 99th percentile
generate HREST_UO = hrest if hrest < P99
label variable HREST_UO "Estimate w/o the top 99% to eliminate outliers"

bysort PAIS: egen StDev_UO = sd(HREST_UO)
label variable StDev_UO "SD of estimate by country w/o outlier"

* Administrative crime level assigned to each country code (0 when no code matches)
generate CrimeLevel = 0
replace CrimeLevel = 50   if inlist(PAIS, 0, 1)
replace CrimeLevel = 1200 if PAIS == 2
replace CrimeLevel = 12   if PAIS == 3
replace CrimeLevel = 240  if PAIS == 4
replace CrimeLevel = 470  if PAIS == 5
* For the US the level is 330 according to the survey file
replace CrimeLevel = 330  if PAIS == 6
label variable CrimeLevel "Crime level in each country (admin data)"

* Overestimation of crime (dummy): 1 if the estimate exceeds the country's crime level,
* 0 otherwise, missing when hrest is missing.
* Stata treats a missing value as larger than any number, so the comparison must
* exclude missing hrest explicitly; otherwise those rows would be flagged as 1.
* The comparison is row-wise, so no by-group prefix is needed.
gen Over=0 if hrest!=.
	replace Over=1 if hrest>CrimeLevel & hrest!=.
	label var Over "Individual over estimated crime levels in her country"

* All dummies below are defined only where hrest is non-missing.

* Estimate below the country's crime level
generate Under = (hrest < CrimeLevel) if hrest != .
label variable Under "Individual Under estimated crime levels in her country"

* Estimate exactly equal to the country's crime level
generate Exact = (hrest == CrimeLevel) if hrest != .
label variable Exact "Individual estimated crime levels in her country exactly"

* More than one SD (computed without outliers) above / below the crime level
generate OverSD = (hrest > CrimeLevel + StDev_UO) if hrest != .
label variable OverSD "Individual over estimated crime levels in her country by more than a SD"

generate UnderSD = (hrest < CrimeLevel - StDev_UO) if hrest != .
label variable UnderSD "Individual under estimated crime levels in her country by more than a SD"

* More than one third of an SD (computed without outliers) above / below the crime level
generate OverSD_3 = (hrest > CrimeLevel + ((1/3)*StDev_UO)) if hrest != .
label variable OverSD_3 "Individual over estimated crime levels in her country by more than a 1/3*SD"

generate UnderSD_3 = (hrest < CrimeLevel - ((1/3)*StDev_UO)) if hrest != .
label variable UnderSD_3 "Individual under estimated crime levels in her country by more than a 1/3*SD"

* Distance between the respondent's estimate and the actual crime level.
* Variables with the _UO suffix use the estimate that excludes the top percentile.

* Signed difference
generate CrimeDist = hrest - CrimeLevel
label variable CrimeDist "Crime estimate minus crime level in the country"

generate CrimeDist_UO = HREST_UO - CrimeLevel
label variable CrimeDist_UO "Crime estimate minus crime level in the country w/o top 99 perc"

* Squared difference
generate CrimeDist2 = (hrest - CrimeLevel)^2
label variable CrimeDist2 "Crime estimate minus crime level in the country, squared"

generate CrimeDist2_UO = (HREST_UO - CrimeLevel)^2
label variable CrimeDist2_UO "Crime estimate minus crime level in the country, squared w/o top 99 perc"

* Ratio of estimate to actual level
generate CrimeShare = hrest / CrimeLevel
label variable CrimeShare "Crime estimate over crime level in the country"

generate CrimeShare_UO = HREST_UO / CrimeLevel
label variable CrimeShare_UO "Crime estimate over crime level in the country w/o top 99 perc"

*******************************
**# Generate Cognitive Abilities
*******************************

* Correct-answer dummies; missing when the question was not answered
generate OK_Math = (lq1_2 == 3) if lq1_2 != .
label variable OK_Math "Individual responded correctly Math question"

generate OK_Tri = (lq2_2 == 3) if lq2_2 != .
label variable OK_Tri "Individual responded correctly Tri question"

generate OK_SQ = (qcon1 == 4) if qcon1 != .
label variable OK_SQ "Individual responded correctly SQ question"

* Both Math and Triangle correct (defined when both dummies are non-missing)
generate OK_MT = (OK_Math == 1 & OK_Tri == 1) if OK_Math != . & OK_Tri != .
label variable OK_MT "Individual responded correctly both Math and Triangle questions"

* OK_MT leaves out the Square question, so a second version includes it.
* Original note: nobody who answered the previous two correctly got the third wrong.
generate OK_MT_complete = (OK_Math == 1 & OK_Tri == 1 & OK_SQ == 1) if OK_Math != . & OK_Tri != . & OK_SQ != .
label variable OK_MT_complete "Individual responded correctly both Math, Triangle and Square questions"

****************************
**# Generate Policy Variables
****************************

* Coin-allocation items, renamed after the policy each one funds
rename (iace1e1 iace1f1 iace1h1 iace1g1) (PenaltyCoins DetectionCoins PoliceCoins PreventCoins)


* Share of all allocated coins going to each policy
generate Prevent_Share = PreventCoins / (PenaltyCoins + DetectionCoins + PoliceCoins + PreventCoins)
foreach item in Penalty Detection Police {
	generate `item'_Share = `item'Coins / (PreventCoins + DetectionCoins + PoliceCoins + PenaltyCoins)
}

* Differences in coins between policies
generate Pen_Prev  = PenaltyCoins - PreventCoins
generate Pen_Det   = PenaltyCoins - DetectionCoins
generate Pen_Pol   = PenaltyCoins - PoliceCoins
generate Pen_Rest  = PenaltyCoins - (PreventCoins + DetectionCoins + PoliceCoins)
generate Prev_Rest = PreventCoins - (PenaltyCoins + DetectionCoins + PoliceCoins)

* Binary policy preferences: 1 when the item equals 1, missing when the item is missing
generate Police = (i_g1 == 1) if i_g1 != .
label variable Police "People prefer more resources to the police rather than subsidies for private security"

generate HotSpot = (i_g6a == 1) if i_g6a != .
label variable HotSpot "People prefer HotSpot rather than equal distribution of resources"

generate Taxes = (i_g7 == 1) if i_g7 != .
label variable Taxes "People prefer higher taxes for more security rather than lower taxes"

****************************
**#**** TRUST VARIABLES*******
****************************

* Ultimatum game: 1 if ug equals 1, 0 otherwise (including when ug is missing)
generate UG_Y = (ug == 1)
label variable UG_Y "Accepted Ultimatum Offer"

* Points collected across the games
generate TGSUM = (10 - tg1) + (12 - tg2) + (10 - dg1) + (3 * UG_Y)
label variable TGSUM "Number of points collected after games"

* Maximum attainable points: 10 + 12 + 10 + 3
generate TGSUM_MAX = 35
label variable TGSUM_MAX "Max number of points possibles to collect"

* Click-tracking variables are intentionally kept: they could not be identified
* reliably, so the drop below remains disabled.
*drop *First_Click *Last_Click *Click_Count *Page_Submit 

****************************
**# Overconfidence Variables
****************************

// TODO(review): decide how these variables should be handled.
// Proposal: total_ocon can take values from 9 to 45, since there are 9 questions with values from 1 to 4 plus one question (ocon10) with values from 0 to 9.
// Alternatively, ocon1c to ocon9c could be recoded so that a value of 1 becomes 0, because a value of 1 means that the individual
// has no confidence in his answer.

* Sum of the nine confidence items plus ocon10
generate total_ocon = ocon1c + ocon2c + ocon3c + ocon4c + ocon5c + ocon6c + ocon7c + ocon8c + ocon9c + ocon10

// Overconfidence Index

* Correct-answer dummies for questions 1 to 9 of Section 9.
* answer_key holds the code of the correct option for each question, in order;
* the dummy is missing when the question itself is missing.
local answer_key 3 4 2 2 1 3 2 2 3
forvalues q = 1/9 {
	local correct : word `q' of `answer_key'
	generate OK_Ocon_`q' = (ocon`q' == `correct') if ocon`q' != .
	label variable OK_Ocon_`q' "Individual responded correctly Q`q' of Section 9"
}

// Respondents with oconcom equal to 2 (not willing to answer without external help,
// per the original note) are set to missing on every correct-answer dummy.
// Each dummy is then copied into a _N version in which a wrong answer (0) becomes -1.

global ocon_vars "OK_Ocon_1 OK_Ocon_2 OK_Ocon_3 OK_Ocon_4 OK_Ocon_5 OK_Ocon_6 OK_Ocon_7 OK_Ocon_8 OK_Ocon_9"

foreach item of global ocon_vars {
	replace `item' = . if oconcom == 2
	recode `item' (0 = -1), generate(`item'_N)
}

// Overconfidence index, ranging from -36 to 36. A wrong answer carries -1 in OK_Ocon_#_N.
// Each of those variables is multiplied by the matching confidence item (ocon#c) and the nine products are added.
// In the worst case, a respondent who answered all nine questions wrong while feeling absolutely sure gets -36.

* Build the sum of products term by term, in question order
local idx_terms (OK_Ocon_1_N*ocon1c)
forvalues q = 2/9 {
	local idx_terms `idx_terms' + (OK_Ocon_`q'_N*ocon`q'c)
}
generate Overconfidence_Index = `idx_terms'
label variable Overconfidence_Index "Overconfidence index *issue with 1"

// Since a confidence value of 1 means no confidence in the answer, the ocon#c items are shifted down by one (1 to 4 becomes 0 to 3):

global ocon_conf "ocon1c ocon2c ocon3c ocon4c ocon5c ocon6c ocon7c ocon8c ocon9c"

foreach item of global ocon_conf {
	recode `item' (1=0) (2=1) (3=2) (4=3), generate(`item'_R)
}

* Same index using the shifted confidence items
local idx_terms_r (OK_Ocon_1_N*ocon1c_R)
forvalues q = 2/9 {
	local idx_terms_r `idx_terms_r' + (OK_Ocon_`q'_N*ocon`q'c_R)
}
generate Overconfidence_Index_R = `idx_terms_r'
label variable Overconfidence_Index_R "Overconfidence index remake"


// Question 10 (ocon10) asks how many questions the respondent thinks were answered correctly.
// Comparing it with the actual number of correct answers gives an overestimation measure.

generate right_answers = OK_Ocon_1 + OK_Ocon_2 + OK_Ocon_3 + OK_Ocon_4 + OK_Ocon_5 + OK_Ocon_6 + OK_Ocon_7 + OK_Ocon_8 + OK_Ocon_9
label variable right_answers "Questions answered right"

generate overestimation = ocon10 - right_answers
label variable overestimation "Overestimation index: num"

* 1 if the respondent overestimated by any amount; missing when overestimation is missing
generate overestim_dummy = (overestimation > 0) if overestimation < .
label variable overestim_dummy "Overestimation dummy"

* Robustness check (referee): 1 if overestimated by more than one question;
* respondents who overestimated by exactly one are set to missing
generate overestim_dummy_2 = (overestimation > 1) if overestimation < . & overestimation != 1
label variable overestim_dummy_2 "Overestimation dummy$^{2}$"

// Values of overestimation range from -9 to 9. A respondent who answered everything right but believes all answers
// were wrong gets -9 (low self-confidence); one who answered everything wrong but believes all answers were right gets 9.

**********************************
**#** INDEPENDENT VARIABLES ***
**********************************

* Female: 1 if q1 equals 2, 0 otherwise, missing when q1 equals 3.
* NOTE(review): a missing q1 is coded 0 by this rule - confirm that is intended.
generate Female = (q1 == 2) if q1 != 3
label define malefemale 0 "Male" 1 "Female"
label values Female malefemale

* AGE: cleaned copy of q2.
gen Age=q2
* Out-of-range values are set to missing FIRST. Doing this after the keyed-in
* corrections would wipe them out, because 777 and 511 are themselves above 94.
	replace Age=.  if q2>94
* Typing errors with a repeated digit
	replace Age=77 if q2==777
	replace Age=51 if q2==511
* Typing errors with a misplaced decimal point. q2 does not hold these decimals
* exactly (.3 is stored as .30000001 and so on), so an exact == comparison never
* matches; compare within a small tolerance instead. This replaces the former
* edits by observation number, which depended on the row order of the data.
	replace Age=26 if abs(q2-.26)<1e-6
	replace Age=28 if abs(q2-.28)<1e-6
	replace Age=30 if abs(q2-.3)<1e-6
	replace Age=59 if abs(q2-5.9)<1e-6

* Respondents with reported ages below 18 (2, 6, 11 and 17 in the original notes)
* are dropped; missing Age is not below 18, so those rows are kept.
tab Age
drop if Age<18

* Education code
rename ed educ
label variable educ "Education"

* In the US sample the education codes begin at 1 instead of 0, so shift them down by one
replace educ = educ - 1 if PAIS == 6

* Attainment dummies by threshold on educ; missing when educ is missing
generate Elementary = (educ > 6)  if educ != .

generate HighSchool = (educ > 11) if educ != .

generate College    = (educ > 15) if educ != .

* Trust items: give the survey codes descriptive names

rename (b13 b21a b10a b18 i_it1bn) (trust_gov trust_pres trust_just trust_pol trust_others)
* Code 8 in trust_others is set to missing
recode trust_others (8=.)

* Marital status: map the US codes (panel_marstat) onto the scale used in panel_p1

replace panel_p1 = 1 if PAIS == 6 & panel_marstat == 5
* Domestic/civil partnership is unified with married, as in "Casado/a o viviendo en pareja"
replace panel_p1 = 2 if PAIS == 6 & inlist(panel_marstat, 1, 6)
replace panel_p1 = 3 if PAIS == 6 & inlist(panel_marstat, 2, 3)
replace panel_p1 = 4 if PAIS == 6 & panel_marstat == 4

* Original note: panel_p2 and panel_p3 are not available for the US
rename (panel_p1 panel_p2 panel_p3) (marital_status pers_home pers_charge)

* Employment
* Each country has its own labour-situation variable; the codes counted as
* employed or retired differ by country. Rows that match no rule stay 0.
* PAIS 0 was recoded to 1 earlier in this file, so the PAIS 0 conditions are
* kept only for safety.

gen d_employed=0
	replace d_employed=1 if panel_ar_laboral_situation==1 & inlist(PAIS,0,1)
	replace d_employed=1 if inlist(panel_br_laboral_situation,1,2) & PAIS==2
	replace d_employed=1 if inlist(panel_cl_laboral_situation,1,2) & PAIS==3
	replace d_employed=1 if inlist(panel_co_laboral_situation,1,2) & PAIS==4
	replace d_employed=1 if inlist(panel_mx_laboral_situation,1,2) & PAIS==5
	replace d_employed=1 if inlist(panel_employ,1,2) & PAIS==6
	label define d_employed 0 "desempleado" 1 "empleado", replace
* The label was defined but never attached to the variable; attach it
	label values d_employed d_employed

gen d_retired=0
	replace d_retired=1 if panel_ar_laboral_situation==3 & inlist(PAIS,0,1)
	replace d_retired=1 if panel_br_laboral_situation==7 & PAIS==2
	replace d_retired=1 if panel_cl_laboral_situation==6 & PAIS==3
	replace d_retired=1 if panel_co_laboral_situation==6 & PAIS==4
	replace d_retired=1 if panel_mx_laboral_situation==6 & PAIS==5
	replace d_retired=1 if panel_employ==5 & PAIS==6
	label define d_retired 0 "Otro" 1 "Jubilado/Pensionado/Retirado", replace
* Same fix: attach the value label that was previously only defined
	label values d_retired d_retired

* Sex: keep the raw item under a descriptive name
rename q1 Sex

// Marital status dummies: 1 for the given category, 0 otherwise (including missing) //

* Single
generate d_single   = (marital_status == 1)

* Married
generate d_married  = (marital_status == 2)

* Divorced
generate d_divorced = (marital_status == 3)

* Widow
generate d_widow    = (marital_status == 4)


* Satisfaction
rename ls3 satisfaction


*******************************************************************************
**# Recoding MOR variables // Currently more confidence is lower number value

* MOR 1: code 2 becomes 0, so that 1 means More and 0 means Less

recode mor1 (2=0), generate(nmor1)
* NOTE(review): missing compares greater than 2, so this also drops rows with
* missing nmor1 - confirm that is intended.
drop if nmor1 > 2
label define masmenos1 1"More" 0 "Less" 
label values nmor1 masmenos1

* MOR 2,3,4: swap codes 1 and 3 so that a higher value means more.
* The swap goes through the temporary code 4, which ends up as 3.

forvalues i = 2/4 {
	recode mor`i' (1=4) (3=1), generate(nmor`i')
	label variable nmor`i' "Recode MOR"
	recode nmor`i' (4=3)
}

label define masmenos 3"More" 2"Equal" 1 "Less ", replace 
label values nmor2 nmor3 nmor4 masmenos


* Dummy versions: 1 if More (overplacement), 0 if Equal or Less

forvalues i = 2/4 {
	recode nmor`i' (3=1) (1 2=0), generate(d_nmor`i')
	label variable d_nmor`i' "Dummy recode MOR"
}

generate d_nmor1 = nmor1

label define masmenos2 1"More " 0"Equal or less ", replace 
label values d_nmor1 d_nmor2 d_nmor3 d_nmor4 masmenos2


// Row total and row mean of the confidence reported for the nine general-interest questions

local conf_items ocon1c ocon2c ocon3c ocon4c ocon5c ocon6c ocon7c ocon8c ocon9c
egen total_trust = rowtotal(`conf_items')
egen mean_trust  = rowmean(`conf_items')


********************************************************************************
**#********* RECODE ARM 7,8,9 // Opondria resistencia 
********************************************************************************
* Inspect the raw items before recoding
foreach item in arm7 arm8 arm9 arm3 arm4 arm5 arm6 {
	tab `item'
}

* Cleaning empty categories

recode arm6 (8=.)
recode arm7 arm8 (5=.)
recode arm9 (4=.)

* The categories "would resist if I had a gun" and "would resist if I had a knife"
* are merged into a single category (Sometimes)

foreach item in arm7 arm8 {
	recode `item' (1=0) (2 3=1) (4=2) (5=.), generate(n`item')
}
label define opondria 2"Always " 1"Sometimes"  0"Never ", replace 
label values narm7 narm8 opondria


* arm9 is shifted down by one onto the same 0 to 2 scale
generate narm9 = arm9
recode narm9 (4=.) (3=2) (2=1) (1=0)
label values narm9 opondria


* Dummy: would resist (Sometimes or Always) versus would never resist

forvalues i = 7/9 {
	recode narm`i' (0=0) (1 2=1), generate(d_narm`i')
}
label define sino  1"Resistence: Yes" 0"Resistence: No", replace 
label values d_narm7 d_narm8 d_narm9  sino

* Robustness check 1: (18 sept) - "Sometimes" is coded as 0, so only "Always" counts as resistance

forvalues i=7/9 {
	recode narm`i' (0=0)(1=0)(2=1), gen (r1_d_narm`i')
}
label define sino  1"Resistence: Yes" 0"Resistence: No", replace 
* Attach the label to the robustness variables created above; the previous code
* re-labelled d_narm7-d_narm9 instead and left r1_d_narm7-r1_d_narm9 unlabelled
label values r1_d_narm7 r1_d_narm8 r1_d_narm9  sino


* Dummy versions of arm3-arm6: codes 1 to 3 become 0 (Disagree), 5 to 7 become 1 (Agree),
* and codes 4 and 8 are set to missing

forvalues i = 3/6 {
	recode arm`i' (1 2 3=0) (4 8=.) (5 6 7=1), generate(d_narm`i')
}
label define disagree 0"Disagree" 1"Agree", replace 
label values d_narm3 d_narm4 d_narm5 d_narm6 disagree


* Robustness check 1: (18 sept)  (4 - 7): codes 4 to 7 count as 1, codes 1 to 3 as 0, 8 is missing
forvalues i=3/6 {
	recode arm`i' (1=0) (2=0) (3=0) (4=1) (5=1) (6=1) (7=1) (8=.), gen (r1_d_narm`i')
	lab var r1_d_narm`i' "Arm `i' - (4-7)"
}

lab values r1_d_narm3 r1_d_narm4 r1_d_narm5 r1_d_narm6 disagree
	
* Robustness check 2: (18 sept)  (6 - 7): only codes 6 and 7 count as 1, codes 1 to 5 as 0, 8 is missing
forvalues i=3/6 {
	recode arm`i' (1=0) (2=0) (3=0) (4=0) (5=0) (6=1) (7=1) (8=.), gen (r2_d_narm`i')
	* Label the r2 variable; the previous code overwrote the r1 label here
	lab var r2_d_narm`i' "Arm `i' - (6-7)"
}

lab values r2_d_narm3 r2_d_narm4 r2_d_narm5 r2_d_narm6 disagree

*****************************************************************************
**#********* Principal Component Analysis Dependent variables******************
*****************************************************************************

* Graph scheme used by any plot drawn from here on
set scheme s1color

* Items entering the PCA of dependent variables: raw arm3-arm6 plus recoded narm7-narm9
global vars "arm3 arm4 arm5 arm6 narm7 narm8 narm9"


// This is useful for interpretation https://online.stat.psu.edu/stat505/lesson/11/11.4

* PCA with the corr and means options; esttab then prints the loading matrix e(L) as CSV
* and predict stores the scores of the first three components as pc1, pc2 and pc3.
* NOTE(review): esttab is not an official Stata command (estout package) - confirm it is installed.
pca $vars, corr means
esttab e(L), csv
predict pc1 pc2 pc3
// estat smc
// estat loadings
// estat kmo
//
// loadingplot, components(2)
// graph export "$ONLINE\Graphs\Loadingplot_PCA.png", replace

*pcacoefsave using "$pca\pca_results_arm", replace

// twoway 	function Principal_Component_1 = normalden(x, 1.41e-09, 1.669515), recast(area) range(-3.171643  4.084943) ///
// 			title("Density PC1 (red) y PC2 (blue)", size(medium)) ///
// 			subtitle("(assuming normal distribution)", size (small)) fcolor(red%15) ytitle("Density") lcolor(red)  xtitle("") legend(label(1 "PC1")) ///
// 			|| ///
// 			function Principal_Component_2 = normalden(x, 2.21e-09, 1.23824), recast(area) fcolor(b%15) recast(area) range(-5.033438  2.462579) ///
// 			lcolor(blue) ///
// 			legend(label(2 "PC2"))  xtitle("") || ///
//
//
// graph export "$ONLINE\Graphs\PCA_DepVars.png", replace

			// pc1 and pc2 are kept because (per the original note) their eigenvalues are bigger than 1
			// rotating the components could be explored
* Keep the first two scores under descriptive names and discard the third;
* this also frees the names pc1-pc3 for the later predict calls in this file
ren pc1 Principal_Component_1
ren pc2 Principal_Component_2
drop pc3


********************************************************************************
********************************************************************************
********************************************************************************

* Income: one raw variable per country (the US one is panel_faminc_new)
local raw_income panel_ar_p14 panel_br_p14 panel_cl_p14_2018 panel_co_p14 panel_mx_p14 panel_faminc_new
des `raw_income'

* Unlabelled tabulations of each raw variable.
* Original note: Chile has no income information.
foreach item of local raw_income {
	tab `item', nolab
}

* Remove missing codes: 97, 98 and 99 become missing in a new income_<country> copy

local iso ar br cl co mx us
forvalues k = 1/6 {
	local source : word `k' of `raw_income'
	local cc     : word `k' of `iso'
	recode `source' (97 98 99 = .), generate(income_`cc')
}

* Mean and SD of every country-specific income variable, by country
foreach stat in mean sd {
	tabstat income_ar income_br income_cl income_co income_mx income_us, statistics( `stat' ) by(country)
}

* unifying income variable around means

* Income level by country (Argentina, Brazil, Colombia, Mexico, US).
* Within each country, income is classified relative to the country's own mean
* and standard deviation:
*   1 = more than one SD below the mean     2 = below the mean, within one SD
*   3 = at or above the mean, within one SD 4 = one SD or more above the mean
* Missing income stays missing. Stata treats missing as larger than any number,
* so the ">=" comparisons are restricted to non-missing income; without that
* guard every respondent with missing income was classified as level 4.
* Chile is not included here; the original notes state it has no income information.
local iso  ar br co mx us
local pais 1 2 4 5 6
forvalues k = 1/5 {
	local cc : word `k' of `iso'
	local p  : word `k' of `pais'

	gen level_income_`cc'=.
	egen stDev_income_`cc'=sd(income_`cc') if PAIS==`p'
	egen mean_income_`cc'=mean(income_`cc') if PAIS==`p'
		replace level_income_`cc'=3 if income_`cc'>= mean_income_`cc' & income_`cc'<. & PAIS==`p'
		replace level_income_`cc'=4 if income_`cc'>= mean_income_`cc' + stDev_income_`cc' & income_`cc'<. & PAIS==`p'
		replace level_income_`cc'=2 if income_`cc'< mean_income_`cc' & PAIS==`p'
		replace level_income_`cc'=1 if income_`cc'< mean_income_`cc' - stDev_income_`cc' & PAIS==`p'
	tab income_`cc' level_income_`cc'
}

* Each respondent has at most one non-missing country level, so the row mean
* simply picks that value up
egen level_income= rowmean(level_income_us level_income_mx level_income_ar level_income_co level_income_br) 
	label var level_income "Income: Above and below std categories"
* NOTE(review): the label text speaks of 1 and 2 SD, whereas the cut-offs above
* are the mean and one SD either side - consider rewording the labels
	label define level_income 1"2 SD below mean income" 2 "1 SD below mean income" 3 "1 SD above mean income" 4 "2 SD above mean income", replace
	label values level_income level_income

********************************************************************************
**# 3.3 Characterizing overconfidents: gender, age, education, country, over
	
* Overestimation in three categories around the pooled mean:
* 1 (Low) more than one SD below, 3 (High) more than one SD above, 2 (Medium) otherwise.
* Later statements overwrite earlier ones, so the order below matters.
generate overestimation_cat = .
egen overestimation_stDev = sd(overestimation)
egen overestimation_mean  = mean(overestimation)
replace overestimation_cat = 2 if overestimation >= overestimation_mean - overestimation_stDev & overestimation < .
replace overestimation_cat = 2 if overestimation <= overestimation_mean + overestimation_stDev
replace overestimation_cat = 3 if overestimation >  overestimation_mean + overestimation_stDev & overestimation < .
replace overestimation_cat = 1 if overestimation <  overestimation_mean - overestimation_stDev
label variable overestimation_cat "Overstimation in categories"
label define over 1 "Low" 2 "Medium" 3 "High", modify
label values overestimation_cat over
label list over
tab overestimation_cat

* Gender: 1 when Sex is 1 (Male), 0 when Sex is 2 (Female); any other value stays missing
generate gender = 1 if Sex == 1
replace gender = 0 if Sex == 2
label variable gender "Gender"
label define gender 1 "Male" 0 "Female", modify
label values gender gender
label list gender
tab gender


* Age in ten-year bands from 18 upwards; missing Age stays missing.
gen age_cat=1 if Age>=18 & Age<30
	replace age_cat=2 if Age>=30 & Age<40
	replace age_cat=3 if Age>=40 & Age<50
	replace age_cat=4 if Age>=50 & Age<60
	replace age_cat=5 if Age>=60 & Age<70
* Missing compares greater than any number in Stata, so the open-ended top band
* must exclude missing Age; otherwise those rows were coded "70 or more"
	replace age_cat=6 if Age>=70 & Age<.

label var age_cat "Age in categories"	
	lab define age_cat 1 "18 to 29" 2 "30 to 39" 3 "40 to 49" 4 "50 to 59" 5 "60 to 69" 6 "70 or more", modify 
	lab val age_cat age_cat
	label list age_cat

* Education in five categories built from the educ code; missing educ stays missing.
gen education=1 if inlist(educ,0,1,2,3,4,5,6,7)
	replace education=2 if inlist(educ,8,9,10,11)
	replace education=3 if educ==12
	replace education=4 if inlist(educ,13,14,15,16)
* Missing compares greater than any number in Stata, so the open-ended top
* category must exclude missing educ; otherwise those rows were coded 5
	replace education=5 if educ>=17 & educ<.
	lab define education 1 "Primary or less" 2 "Some secondary" 3 "Secondary" 4 "Tertiary/some College-University" 5"College-Univesity or more", modify 
	lab val education education 
	label list education
tab education

* Variable and value labels for the harmonized marital status
label variable marital_status "Marital Status"
label define marital_status 1"Single" 2"Married/civil partnership" 3"Divorced/separated" 4"Widow/widower", modify
label values marital_status marital_status
label list marital_status


*******************************************************************************
**#********* Principal Component Analysis Independent variables*******************
********************************************************************************
// cd "$output\PCA"
* Variable lists for the two PCAs of independent variables:
* vars2 holds the four recoded MOR items, vars3 adds overestimation_cat
global vars2 nmor1 nmor2 nmor3 nmor4 
global vars3 nmor1 nmor2 nmor3 nmor4 overestimation_cat


* First PCA (MOR items only). The esttab calls print the loadings in several layouts
* and the last one writes them to pca.tex in the current working directory.
* NOTE(review): esttab is not an official Stata command (estout package) - confirm it is installed.
pca $vars2, corr means
matrix list e(L)
esttab e(L)
esttab, ///
    cells("L[1](transpose) L[2](transpose) L[3](transpose) L[4](transpose)  Psi") ///
    nogap noobs nonumber nomtitle
esttab, ///
    cells("L[1](t label(Comp 1)) L[2](t) L[3](t) L[4](t) Psi") ///
    nogap noobs nonumber nomtitle
esttab, ///
    cells("L[Comp1](t) L[Comp2](t) L[Comp3](t) L[Comp4](t) Psi[Unexplained]") ///
    nogap noobs nonumber nomtitle
esttab using pca.tex, replace ///
    cells("L[Comp1](t) L[Comp2](t) L[Comp3](t) L[Comp4](t) Psi[Unexplained]") ///
    nogap noobs nonumber nomtitle
	
* Scores of the first two components, renamed so pc1 and pc2 can be reused below
predict pc1 pc2 
	rename pc1 PC1_indep
	rename pc2 PC2_indep
// estat smc
// estat loadings
// estat kmo

// loadingplot, components(2)
// graph export "$output/PCA/Loadingplot_PCA2.png", replace
*pcacoefsave using "$pca\pca_results_arm", replace

* Second PCA (MOR items plus overestimation_cat); scores of the first two components
pca $vars3, corr means
esttab e(L), csv
predict pc1 pc2 
	rename pc1 PC1_indep2
	rename pc2 PC2_indep2
// estat smc
// estat loadings

// loadingplot, components(2)
// graph export "$output/PCA/Loadingplot_PCA3.png", replace
*pcacoefsave using "$pca\pca_results_arm", replace	
	


********************************************************************************
**#******** STANDARDIZE VARIABLES
********************************************************************************

* Variable lists to standardize: controls, dependent items and principal components
global controls_1 "overestimation nmor1 nmor2 nmor3 nmor4 Age satisfaction trust_gov trust_pres trust_just trust_pol trust_others"
global dep_vars "arm3 arm4 arm5 arm6 narm7 narm8 narm9"
global dep_vars2 "Principal_Component_1 Principal_Component_2 PC1_indep PC1_indep2"

* One standardized copy (suffix _std) per listed variable, created in list order
foreach item of varlist $controls_1 $dep_vars $dep_vars2 {
	egen `item'_std = std(`item')
}

global depvars_std "arm3_std arm4_std arm5_std arm6_std narm7_std narm8_std narm9_std"

* PCA on the standardized dependent items; the scores of the first three
* components are stored as pc1, pc2 and pc3
pca $depvars_std, corr means
esttab e(L), csv
predict pc1 pc2 pc3
// estat smc
// estat loadings

// loadingplot, components(2)
// graph export "$output/PCA/Loadingplot_PCA_std.png", replace

*pcacoefsave using "$pca\pca_results_arm", replace

// twoway 	function Principal_Component_1 = normalden(x, 1.41e-09, 1.669515), recast(area) range(-3.171643  4.084943) ///
// 			title("Density PC1 (red) y PC2 (blue)", size(medium)) ///
// 			subtitle("(assuming normal distribution)", size (small)) fcolor(red%15) ytitle("Density") lcolor(red)  xtitle("") legend(label(1 "PC1")) ///
// 			|| ///
// 			function Principal_Component_2 = normalden(x, 2.21e-09, 1.23824), recast(area) fcolor(b%15) recast(area) range(-5.033438  2.462579) ///
// 			lcolor(blue) ///
// 			legend(label(2 "PC2"))  xtitle("") || ///
//
//
// graph export "$output/PCA/PCA_DepVars_std.png", replace

			// We keep pc1 and pc2 because their eigenvalues are greater than 1.
			// We could also explore rotating the principal components.
			

********************************************************************************
**# INCOME						
********************************************************************************
* Pooled income: row mean over the country-specific income variables,
* ignoring the ones that are missing for a given respondent.
egen income = rowmean(income_ar income_br income_cl income_co income_mx income_us)


* Income flag: 1 when income is observed, 0 when it is missing and PAIS is 3,
* and missing in every other case.
* NOTE(review): despite its name, the flag equals 1 when income is NOT
* missing - confirm the intended coding and why only PAIS 3 is set to 0.
gen missing_income = 1 if income < .
replace missing_income = 0 if missing(income) & PAIS == 3


********************************************************************************
**#* M.1 Further cleaning (Matias)
********************************************************************************

* Overplacement indices built from principal-component scores.
* After pca, predict with a single new variable name stores the score of the
* first component; each score is then z-scored with egen std().

* Variant 2: overplacement items without responsibility using weapons (nmor4 left out)
pca nmor1 nmor2 nmor3
predict overplace_pc_2, score
label variable overplace_pc_2 "Overplacement (PCA) - No weapons"
egen overplace_pc_std_2 = std(overplace_pc_2)
label variable overplace_pc_std_2 "Overplacement (PCA, standardized) - No weapons"

* Variant 1: all four overplacement items, including responsibility using weapons
pca nmor1 nmor2 nmor3 nmor4
predict overplace_pc_1, score
label variable overplace_pc_1 "Overplacement (PCA)"
egen overplace_pc_std_1 = std(overplace_pc_1)
label variable overplace_pc_std_1 "Overplacement (PCA, standardized)"

* Corrected overestimation index B

// Overconfidence_Index counts a respondent who answers correctly and is very
// confident as less overconfident. Here such a respondent is treated as
// non-overconfident instead, while a respondent who answers correctly without
// any confidence is treated as underconfident.

// Contribution of each question to the index (-3 to 3):

//   Level of confidence :  0    1    2    3 
// *---------------------*----*----*----*----*
// |      Correct answer | -3 | -2 | -1 |  0 |
// *---------------------*----*----*----*----*
// |        Wrong answer |  0 |  1 |  2 |  3 |
// *---------------------*----*----*----*----*

// The index is the sum of the nine ocon#c_R variables minus 3 for each correct
// answer. Assumes each ocon#c_R holds the confidence level coded 0-3 as in the
// table - TODO confirm. A missing value in any term makes the index missing.
local confidence_sum "ocon1c_R"
forvalues q = 2/9 {
	local confidence_sum "`confidence_sum' + ocon`q'c_R"
}
gen overestimB = `confidence_sum' - 3*right_answers
label variable overestimB "Overestimation index B [-27;27]"


* Overestimation question by question (originally generated in the Hard Tasks do-file)

* For each of the 9 questions, flag responses with ocon#c >= 3. The flag is
* left missing when ocon#c itself is missing.
forvalues i=1(1)9{
	gen ocon`i'c_d=(ocon`i'c>=3) if ocon`i'c < .
}

* Number of flagged questions per respondent. egen rowtotal() counts missing
* flags as 0, so ocon10_c is never missing.
egen ocon10_c=rowtotal(ocon1c_d ocon2c_d ocon3c_d ocon4c_d ocon5c_d ocon6c_d ocon7c_d ocon8c_d ocon9c_d)

* Alternative index: flagged questions minus number of correct answers.
gen overestimation_2=ocon10_c-right_answers
	la var overestimation_2 "Overestimation index (Alternative)"

* Z-scored version. Its label now says "standardized" so that it can be told
* apart from the raw index, in line with the other standardized variables
* labelled in this file.
egen overestimation_2_std = std(overestimation_2)
	la var overestimation_2_std "Overestimation index (Alternative, standardized)"

	
* Overplacement dummies
* Each dummy is 1 when the item takes the given code, 0 for any other
* non-missing code, and missing when the item is missing.

* nmor1 is split on codes 0 (less) and 1 (more).
gen ovpl1_less = (nmor1 == 0) if !missing(nmor1)
gen ovpl1_more = (nmor1 == 1) if !missing(nmor1)

* nmor2-nmor4 are split on codes 1 (less), 2 (eq) and 3 (more).
local ovpl_suffixes "less eq more"
foreach item of numlist 2/4 {
	forvalues code = 1/3 {
		local sfx : word `code' of `ovpl_suffixes'
		gen ovpl`item'_`sfx' = (nmor`item' == `code') if !missing(nmor`item')
	}
}


* Use of guns dummies
* One dummy per answer code. Observations whose code lies outside the valid
* range (5 or above for arm7/arm8, 4 or above for arm9, which also covers
* missing values) get a missing dummy.

* arm7 and arm8: codes 1-4 -> never / knife / fire / alws
local four_way "never knife fire alws"
foreach v in arm7 arm8 {
	forvalues code = 1/4 {
		local sfx : word `code' of `four_way'
		gen `v'_`sfx' = (`v' == `code') if `v' < 5
	}
}

* arm9: codes 1-3 -> never / equal / alws
local three_way "never equal alws"
forvalues code = 1/3 {
	local sfx : word `code' of `three_way'
	gen arm9_`sfx' = (arm9 == `code') if arm9 < 4
}

* Correcting encoding (again): the out-of-range codes become missing values.
foreach v in arm7 arm8 {
	replace `v' = . if `v' == 5
}
replace arm9 = . if arm9 == 4
	
* Relabeling variables

* Value labels: a four-option scale for arm7/arm8 and a three-option scale for arm9.
label define arm4op 1 "Never" 2 "Only knife" 3 "Only firearm" 4 "Always"
label define arm3op 1 "Never" 2 "Only equal weapon" 3 "Always"
label values arm7 arm8 arm4op
label values arm9 arm3op

* Variable labels for the main overestimation measures.
label variable right_answers "Number of correct answers"
label variable overestimation "Overestimation index"
label variable overestimation_std "Overestimation index (standardized)"

* Write the cleaned dataset to the data folder, overwriting any previous copy.
save "$data/Crime_.dta", replace
// clear
